Code
library (tidyverse)
library (magrittr)
library (gt)
library (patchwork)
library (SummarizedExperiment)
library (tidySummarizedExperiment)
library (labelled)
library (vegan)
library (RColorBrewer)
# tmp <- fs::dir_map("R scripts mg/", source)
tmp <- fs:: dir_map ("R scripts/" , source)
theme_set (theme_light ())
Load the data
Code
data_source <- "real"
data_dir <- get_data_dir (data_source)
if (data_source == TRUE ) {
mg_dir <- str_c (data_dir, "03 metagenomics combined/" )
counts <- read_csv (str_c (mg_dir, "mg_combined.csv" ))
manifest <- read_csv (str_c (mg_dir, "mg_combined_manifest.csv" ))
} else {
mg_dir <- str_c (data_dir, "02 Metagenomics/" )
counts <- read_csv (str_c (mg_dir, "MVIBR_kSanityVIRGO2_ReadCounts_20250404.csv" ))
counts_corr <- read_csv (str_c (mg_dir, "MVIBR_kSanityVIRGO2_taxaGLcor_20250404.csv" ))
relabs <- read_csv (str_c (mg_dir, "MVIBR_kSanityVIRGO2_RelAbund_20250404.csv" ))
technical_metadata <- read_csv (str_c (mg_dir, "VIBRANT_MG_technicalMetaData_20250404.csv" ))
LBP_strain_info <- readxl:: read_xlsx (str_c (data_dir, "00 Trial Data/IsolateNumbers.xlsx" ))
}
The names of the strains do not correspond exactly to those in the metagenomics files. We will modify the names of the strains in LBP_strain_info to match them.
Code
LBP_strain_info <-
LBP_strain_info |>
mutate (strain_id = ` VMRC ID ` ,
LBP = ifelse (is.na (LC106), "LC-115" , "LC-106 & LC-115" ) |> factor (),
strain_origin = ` Geographic source ` |> factor ()
) |>
dplyr:: rename (Biose_ID = ` Biose ID ` ) |> #VMRC_ID = `VMRC ID`
dplyr:: select (strain_id, LBP, strain_origin, Biose_ID) |>
arrange (strain_origin, LBP) |>
mutate (strain_id = strain_id |> fct_inorder ()) |>
dplyr:: select (strain_id, LBP, strain_origin, contains ("ID" )) |>
mutate (strain_id = sub ("_.*" , "" , strain_id),
strain_id = case_when (strain_id == "CC0028A1" ~ "C0028A1" ,
TRUE ~ strain_id
))
LBP_strain_info |>
set_variable_labels (
strain_id = "Strain ID" ,
strain_origin = "Strain origin" ,
Biose_ID = "Biose ID" ,
#VMRC_ID = "VMRC ID"
) |>
gt (caption = "LBP strain information" ) |>
tab_style (style = cell_text (weight = "bold" ),
locations = cells_column_labels ())
LBP strain information
FF00018
LC-106 & LC-115
SA
GA08
FF00051
LC-106 & LC-115
SA
GA09
UC101
LC-106 & LC-115
SA
GA12
FF00004
LC-115
SA
GA07
FF00064
LC-115
SA
GA10
FF00072
LC-115
SA
GA11
UC119
LC-115
SA
GA13
122010
LC-115
SA
GA14
185329
LC-115
SA
GA15
C0059E1
LC-106 & LC-115
US
GA03
C0022A1
LC-106 & LC-115
US
GA04
C0175A1
LC-106 & LC-115
US
GA06
C0006A1
LC-115
US
GA01
C0028A1
LC-115
US
GA02
C0112A1
LC-115
US
GA05
Create a SummarizedExperiment object
NOTE : not possible for the moment because the number of columns does not correspond between the different assays. counts includes an extra column “multiGenera”. For now, we remove the “MuliGenra” from the counts data frame.
Code
mg_to_SE <- function (counts, counts_corr, relabs, LBP_strain_info, technical_metadata) {
technical_metadata <-
technical_metadata |>
filter (is.na (LibrarySequencedTwice))
# suppress "multiGenera" from counts
counts <-
counts |>
select (- c (MultiGenera))
assay_counts <-
counts |>
dplyr:: select (- c (CST, subCST, score)) |>
as.data.frame () |>
column_to_rownames ("sampleID" ) |>
drop_na () |>
t ()
assay_counts_corr <-
counts_corr |>
dplyr:: select (- c (CST, subCST, score)) |>
as.data.frame () |>
column_to_rownames ("sampleID" ) |>
t ()
assay_relative_ab <-
relabs |>
dplyr:: select (- c (CST, subCST, score)) |>
as.data.frame () |>
column_to_rownames ("sampleID" ) |>
t ()
# include a total reads per sample in the colData
se_coldata <-
counts |>
rowwise () |>
mutate (total_non_human_reads = sum (c_across (- c (sampleID, CST, subCST, score)))) |>
select (sampleID, total_non_human_reads) |>
left_join (
technical_metadata |> filter (is.na (LibrarySequencedTwice)),
by = c ("sampleID" = "UID" )
) |>
as.data.frame ()
se_rowdata <-
counts |>
dplyr:: select (- c (sampleID, CST, subCST, score)) |>
colnames () |>
as.data.frame () |>
setNames ("strain" ) |>
distinct () |>
left_join (
LBP_strain_info,
by = join_by (strain == strain_id)
)
# Harmonization of the order of samples and feature
sorted_sample_ids <- sort (as.character (se_coldata$ sampleID))
assay_counts <- assay_counts[, sorted_sample_ids]
assay_counts_corr <- assay_counts_corr[, sorted_sample_ids]
assay_relative_ab <- assay_relative_ab[, sorted_sample_ids]
se_coldata <- se_coldata[match (sorted_sample_ids, se_coldata$ sampleID), ]
sorted_strains <- assay_relative_ab |> rownames () # sort(as.character(se_rowdata$strain))
assay_counts <- assay_counts[sorted_strains, ]
assay_counts_corr <- assay_counts_corr[sorted_strains, ]
assay_relative_ab <- assay_relative_ab[sorted_strains, ]
se_rowdata <- se_rowdata[match (sorted_strains, se_rowdata$ strain), ]
SummarizedExperiment:: SummarizedExperiment (
assays = list (counts = assay_counts, counts_corr = assay_counts_corr, relabs = assay_relative_ab),
rowData = se_rowdata,
colData = se_coldata
)
}
Code
SE_mg <- mg_to_SE (counts, counts_corr, relabs, LBP_strain_info, technical_metadata)
(include a total count per sample in the colData total_non_human_reads) > OK
assay_name = counts
Exploratory & QC analyses
Code
# A tibble: 751,548 × 29
.feature .sample counts counts_corr relabs sampleID total_non_human_reads
<chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl>
1 122010 MG_1010 0 0 0 MG_1010 9671181.
2 185329 MG_1010 0 0 0 MG_1010 9671181.
3 C0006A1 MG_1010 0 0 0 MG_1010 9671181.
4 C0022A1 MG_1010 1408234. 1494. 0.137 MG_1010 9671181.
5 C0028A1 MG_1010 0 0 0 MG_1010 9671181.
6 C0059E1 MG_1010 268235. 285. 0.0261 MG_1010 9671181.
7 C0112A1 MG_1010 0 0 0 MG_1010 9671181.
8 C0175A1 MG_1010 6571758. 6971. 0.639 MG_1010 9671181.
9 FF00004 MG_1010 0 0 0 MG_1010 9671181.
10 FF00018 MG_1010 0 0 0 MG_1010 9671181.
# ℹ 751,538 more rows
# ℹ 22 more variables: SampleType <chr>, Ext_Lib_Plate <dbl>,
# Ext_Lib_Plate_ID <dbl>, Ext_Lib_Position <chr>, SequencingRun <chr>,
# DateSequenced <dbl>, Lane <dbl>, Sample <chr>, `Library Pool` <chr>,
# Library <chr>, FragmentSize <dbl>, IndexSet <chr>, `Index 1` <chr>,
# `Index 2` <chr>, BioinformaticsProcessingBatch <dbl>,
# `Selected4re-extraction` <dbl>, LibrarySequencedTwice <dbl>, Notes <chr>, …
###Total number of counts/relative abundances per sample
Total number of counts and corrected counts per sample
Code
SE_mg |>
as.tibble () |>
group_by (.sample) |>
summarise (total_count = sum (counts)) |>
ggplot (aes (x = total_count)) +
geom_histogram () +
scale_x_log10 () +
labs (x = "Total number of counts per sample" ,
y = "Number of samples" )
Code
SE_mg |>
as.tibble () |>
group_by (.sample) |>
summarise (total_count = sum (counts_corr)) |>
ggplot (aes (x = total_count)) +
geom_histogram () +
scale_x_log10 () +
labs (x = "Total number of corected counts per sample" ,
y = "Number of samples" )
A sample has a very small total number of counts
Code
SE_mg |>
group_by (.sample) |>
as.tibble () |>
filter (total_non_human_reads < 1e5 ) |>
distinct (.sample) |>
print ()
# A tibble: 1 × 1
.sample
<chr>
1 MG_EQ05822476
Total proportion per sample
Code
SE_mg |>
as.tibble () |>
group_by (.sample) |>
summarise (total_relab = sum (relabs)) |>
ggplot (aes (x = total_relab)) +
geom_histogram () +
labs (x = "Total proportion per sample" ,
y = "Number of samples" )
Missing values
Code
SE_mg |>
as.tibble () |>
ggplot (aes (x= .feature, y = .sample, fill = is.na (counts))) +
geom_tile () +
scale_x_discrete ("Strains" ) +
scale_y_discrete ("Species" ,breaks = NULL ) +
labs (title = "Missing values in readcounts" ) +
scale_fill_manual (
name = "Statut" ,
values = c ("TRUE" = "pink" , "FALSE" = "steelblue2" ),
labels = c ("FALSE" = "Not missing" , "TRUE" = "Missing" )) +
theme (axis.text.x = element_blank (),
axis.ticks.x = element_blank (),
plot.title = element_text (hjust = 0.5 ))
Code
SE_mg |>
as.tibble () |>
ggplot (aes (x= .feature, y = .sample, fill = is.na (counts_corr))) +
geom_tile () +
scale_x_discrete ("Species" ) +
scale_y_discrete ("Samples" ,breaks = NULL ) +
labs (title = "Missing values in corrected readcounts" ) +
scale_fill_manual (
name = "Statut" ,
values = c ("TRUE" = "pink" , "FALSE" = "steelblue2" ),
labels = c ("FALSE" = "Not missing" , "TRUE" = "Missing" )) +
theme (axis.text.x = element_blank (),
axis.ticks.x = element_blank (),
plot.title = element_text (hjust = 0.5 ))
Code
SE_mg |>
as.tibble () |>
ggplot (aes (x= .feature, y = .sample, fill = is.na (relabs))) +
geom_tile () +
scale_x_discrete ("Species" ) +
scale_y_discrete ("Samples" ,breaks = NULL ) +
labs (title = "Missing values in relative abundance" ) +
scale_fill_manual (
name = "Statut" ,
values = c ("TRUE" = "pink" , "FALSE" = "steelblue2" ),
labels = c ("FALSE" = "Not missing" , "TRUE" = "Missing" )) +
theme (axis.text.x = element_blank (),
axis.ticks.x = element_blank (),
plot.title = element_text (hjust = 0.5 ))
Proportion of LBP strains per sample
Code
SE_mg |>
as.tibble () |>
filter (! is.na (Biose_ID)) |>
ggplot (aes (x = .feature, y = .sample, fill = relabs)) +
geom_tile () +
scale_fill_continuous (low = "white" , high = "steelblue2" ) +
scale_x_discrete ("Strains" ) +
scale_y_discrete ("Samples" ,breaks = NULL ) +
facet_grid (. ~ LBP + strain_origin, scales = "free_x" , space = "free" ) +
theme (axis.text.x = element_text (angle = 90 , hjust = 1 ))
Co-occurrence analysis
Is there a strain that is only present when the others are not?
TODO : improve plot (same order of taxa, why there is an X in front of LBP names in y ??)
Code
presence_absence <-
SE_mg |>
as.tibble () |>
filter (! is.na (Biose_ID)) |>
select (.feature, .sample, relabs) |>
pivot_wider (names_from = .feature, values_from = relabs, values_fill = 0 ) |>
mutate (across (where (is.numeric), ~ ifelse (. > 0 , 1 , 0 ))) |>
select (- .sample)
cooccurrence <- presence_absence |>
summarise (across (everything (), sum)) |>
pivot_longer (cols = everything (), names_to = "taxon" , values_to = "presence_count" )
cooccurrence_matrix <- presence_absence |>
summarise (across (everything (), list)) |>
pivot_longer (cols = everything (), names_to = "taxon" , values_to = "presence_list" ) |>
mutate (presence_list = map (presence_list, unlist)) |>
pull (presence_list)
cooccur_mat = matrix (0 , nrow = length (cooccurrence_matrix), ncol = length (cooccurrence_matrix))
rownames (cooccur_mat) = colnames (presence_absence)
colnames (cooccur_mat) = colnames (presence_absence)
for (i in 1 : length (cooccurrence_matrix)){
for (j in 1 : length (cooccurrence_matrix)){
cooccur_mat[i,j] = sum (cooccurrence_matrix[[i]] & cooccurrence_matrix[[j]])
}
}
exclusive_strains <- data.frame (cooccur_mat) |>
mutate (strain = rownames (cooccur_mat)) |>
pivot_longer (! strain, names_to = "other_strain" , values_to = "cooccur" ) |>
group_by (strain) |>
summarise (total_cooccur = sum (cooccur)) |>
filter (total_cooccur == 0 )
print (exclusive_strains)
# A tibble: 0 × 2
# ℹ 2 variables: strain <chr>, total_cooccur <dbl>
Code
cooccur_melted = data.frame (cooccur_mat) |>
mutate (strain = rownames (cooccur_mat)) |>
pivot_longer (! strain, names_to = "other_strain" , values_to = "cooccur" )
ggplot (cooccur_melted, aes (x = strain, y = other_strain, fill = cooccur)) +
geom_tile () +
scale_fill_continuous (low = "white" , high = "steelblue2" ) +
theme (axis.text.x = element_text (angle = 90 , hjust = 1 ))
Control sample
LBP abundance in control samples
Code
SE_mg |>
as.tibble () |>
filter (! is.na (Biose_ID))|>
filter (SampleType != "ClinicalSample" ) |>
ggplot (aes (x = .feature, y = .sample, fill = relabs)) +
geom_tile () +
scale_fill_continuous (low = "white" , high = "steelblue2" ) +
scale_x_discrete ("Strains" ) +
scale_y_discrete ("Samples" ) +
facet_grid (SampleType ~ LBP + strain_origin, scales = "free" , space = "free" ) +
theme (axis.text.x = element_text (angle = 90 , hjust = 1 ))
Code
top20_species_all <-
SE_mg |>
as.tibble () |>
group_by (.feature) |>
summarise (total_relabs = sum (relabs)) |>
arrange (desc (total_relabs)) |>
slice_head (n = 20 ) |>
group_by (color = case_when (
.feature %in% LBP_strain_info$ strain_id ~ colorRampPalette (c ("lightpink" , "hotpink3" ))(n ()),
str_detect (.feature, "Lactobacillus" ) ~ colorRampPalette (c ("chocolate1" ,"chocolate4" ))(n ()),
str_detect (.feature, "Gardnerella" ) ~ colorRampPalette (c ("darkolivegreen1" , "darkolivegreen4" ))(n ()),
str_detect (.feature, "Prevotella" ) ~ colorRampPalette (c ("lightblue" , "deepskyblue3" ))(n ()),
TRUE ~ colorRampPalette (c ("antiquewhite" , "antiquewhite3" ))(n ())
))
top20_species_control <-
SE_mg |>
as.tibble () |>
filter (SampleType != "ClinicalSample" ) |>
group_by (.feature) |>
summarise (total_relabs = sum (relabs)) |>
arrange (desc (total_relabs)) |>
slice_head (n = 20 ) |>
group_by (color = case_when (
.feature %in% LBP_strain_info$ strain_id ~ colorRampPalette (c ("lightpink" , "hotpink3" ))(n ()),
str_detect (.feature, "Lactobacillus" ) ~ colorRampPalette (c ("chocolate1" ,"chocolate4" ))(n ()),
str_detect (.feature, "Gardnerella" ) ~ colorRampPalette (c ("darkolivegreen1" , "darkolivegreen4" ))(n ()),
str_detect (.feature, "Prevotella" ) ~ colorRampPalette (c ("lightblue" , "deepskyblue3" ))(n ()),
TRUE ~ colorRampPalette (c ("antiquewhite" , "antiquewhite3" ))(n ())
))
SE_mg |>
as.tibble () |>
filter (.feature %in% all_of (top20_species_all$ .feature)) |>
filter (SampleType != "ClinicalSample" ) |>
ggplot (aes (x = .feature, y = .sample, fill = relabs)) +
geom_tile () +
scale_fill_continuous (low = "white" , high = "steelblue2" ) +
scale_x_discrete ("Species" ) +
scale_y_discrete ("Samples" ) +
facet_grid (SampleType ~ ., scales = "free" , space = "free" ) +
theme (axis.text.x = element_text (angle = 90 , hjust = 1 ))
Code
SE_mg |>
as.tibble () |>
filter (.feature %in% all_of (top20_species_all$ .feature)) |>
left_join (
top20_species_all |> select (.feature, color),
by = ".feature"
) |>
filter (SampleType != "ClinicalSample" ) |>
mutate (.feature = factor (.feature, levels = sort (unique (.feature)))) |>
ggplot (aes (x = .sample, y = relabs, fill = .feature)) +
geom_bar (stat = "identity" ) +
facet_wrap (~ SampleType, scales = "free_x" ) +
labs (
x = "Samples" ,
y = "Relative Abundance" ,
title = "Controle sample : Top 20 species of all samples" ,
fill = "Species"
) +
scale_x_discrete ("Samples" , breaks = NULL ) +
scale_fill_manual (
values = setNames (top20_species_all$ color, top20_species_all$ .feature),
breaks = sort (unique (top20_species_all$ .feature)),
labels = sort (unique (top20_species_all$ .feature))
) +
theme_bw () +
theme (axis.text.x = element_text (angle = 90 , hjust = 1 ),
plot.title = element_text (hjust = 0.5 ))
Code
SE_mg |>
as.tibble () |>
filter (.feature %in% all_of (top20_species_control$ .feature)) |>
left_join (
top20_species_control |> select (.feature, color),
by = ".feature"
) |>
filter (SampleType != "ClinicalSample" ) |>
mutate (.feature = factor (.feature, levels = sort (unique (.feature)))) |>
ggplot (aes (x = .sample, y = relabs, fill = .feature)) +
geom_bar (stat = "identity" ) +
facet_wrap (~ SampleType, scales = "free_x" ) +
labs (
x = "Samples" ,
y = "Relative Abundance" ,
title = "Controle sample : Top 20 species of control samples" ,
fill = "Species"
) +
scale_x_discrete ("Samples" , breaks = NULL ) +
scale_fill_manual (
values = setNames (top20_species_control$ color, top20_species_control$ .feature),
breaks = sort (unique (top20_species_control$ .feature)),
labels = sort (unique (top20_species_control$ .feature))
) +
theme_bw () +
theme (axis.text.x = element_text (angle = 90 , hjust = 1 ),
plot.title = element_text (hjust = 0.5 ))
PCoA on counts
Code
dist_matrix <-
vegdist (
assay (SE_mg, "counts" ) |> t (),
method = "bray" )
pcoa <- wcmdscale (dist_matrix, eig = TRUE )
print (pcoa)
Call: wcmdscale(d = dist_matrix, eig = TRUE)
Inertia Rank
Total 359.39
Real 428.63 344
Imaginary -69.24 621
Results have 966 points, 344 axes
Eigenvalues:
[1] 60.97 50.39 33.62 21.71 15.98 13.65 12.50 10.89 9.51 8.84 8.08 6.79
[13] 6.25 6.08 5.51 4.77 4.59 4.39 4.38 4.21 4.07 3.73 3.61 3.32
[25] 3.15 3.07 2.94 2.81 2.69 2.58 2.46 2.45 2.32 2.27 2.20 2.10
[37] 2.04 1.96 1.87 1.80 1.71 1.62 1.56 1.54 1.52 1.44 1.40 1.38
[49] 1.32 1.29 1.24 1.21 1.18 1.17 1.12 1.10 1.08 1.07 1.05 1.03
[61] 1.00 0.97 0.95 0.93 0.89 0.88 0.87 0.85 0.82 0.80 0.79 0.77
[73] 0.75 0.74 0.72 0.72 0.71 0.68 0.67 0.66 0.64 0.63 0.62 0.61
[85] 0.60 0.59 0.58 0.57 0.55 0.54 0.53 0.52 0.51 0.51 0.50 0.49
[97] 0.48 0.48 0.47 0.46 0.45 0.45 0.44 0.43 0.42 0.42 0.41 0.41
[109] 0.40 0.40 0.39 0.39 0.38 0.38 0.37 0.36 0.36 0.35 0.34 0.34
(Showing 120 of 965 eigenvalues)
Weights: Constant
Code
pcoa$ eig |>
as.data.frame () |>
rownames_to_column ("axis" ) |>
mutate (axis = str_remove (axis, "Eigenvalue" ),
var_explained = 100 * pcoa$ eig/ sum (pcoa$ eig)) |>
slice (1 : 10 ) |>
gt () |>
tab_header (title = "Eigenvalues of the PCoA" ) |>
cols_label (axis = "Axis" , ` pcoa$eig ` = "Eigenvalue" , var_explained = "Variance explained" ) |>
fmt_number (columns = vars (` pcoa$eig ` , var_explained), decimals = 3 )
Axis
Eigenvalue
Variance explained
1
60.975
16.966
2
50.394
14.022
3
33.616
9.353
4
21.708
6.040
5
15.980
4.446
6
13.648
3.798
7
12.496
3.477
8
10.893
3.031
9
9.509
2.646
10
8.836
2.459
Code
pcoa$ eig |>
as_data_frame () |>
rownames_to_column ("axis" ) |>
mutate (axis = axis |> as.numeric ()) |>
ggplot (aes (x = axis, y = value)) +
geom_col () +
labs (
x= "Axis" ,
y = "Eigenvalue"
) +
theme_bw ()
Code
pcoa_data <-
as.data.frame (pcoa$ points) |>
mutate (sampleID = counts$ sampleID) |>
left_join (
technical_metadata |> filter (is.na (LibrarySequencedTwice)),
by = c ("sampleID" = "UID" )
)
Code
pcoa_plot <- function (pcoa, color_var, axes = 1 : 2 ){
pcoa_data <- pcoa_data |>
mutate (!! sym (color_var) : = as.factor (!! sym (color_var)))
pcoa_data |>
ggplot (aes (x = !! sym (paste0 ("Dim" , axes[1 ])), y = !! sym (paste0 ("Dim" , axes[2 ])), color = !! sym (color_var))) +
geom_point (size = 2 , alpha = 0.7 ) +
coord_fixed () +
labs (
x = paste0 ("PCoA " , axes[1 ], " (" , round (100 * pcoa$ eig[axes[1 ]] / sum (pcoa$ eig), 1 ), "%)" ),
y = paste0 ("PCoA " , axes[2 ], " (" , round (100 * pcoa$ eig[axes[2 ]] / sum (pcoa$ eig), 1 ), "%)" ),
color = color_var
) +
theme_bw ()
}
Type of sample
Code
pcoa_plot (pcoa, "SampleType" , c (1 , 2 ))
Code
pcoa_plot (pcoa, "SampleType" , c (3 , 4 ))
Code
pcoa_plot (pcoa, "SampleType" , c (5 , 6 ))
Code
manova <- adonis2 (
dist_matrix ~ SampleType,
data = pcoa_data
)
manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999
adonis2(formula = dist_matrix ~ SampleType, data = pcoa_data)
Df SumOfSqs R2 F Pr(>F)
SampleType 4 4.42 0.01229 2.989 0.001 ***
Residual 961 354.97 0.98771
Total 965 359.39 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Plate
Code
pcoa_plot (pcoa, "Ext_Lib_Plate" , c (1 , 2 ))
Code
pcoa_plot (pcoa, "Ext_Lib_Plate" , c (3 , 4 ))
Code
pcoa_plot (pcoa, "Ext_Lib_Plate" , c (5 , 6 ))
Permutation test
Code
manova <- adonis2 (
dist_matrix ~ Ext_Lib_Plate,
data = pcoa_data
)
manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999
adonis2(formula = dist_matrix ~ Ext_Lib_Plate, data = pcoa_data)
Df SumOfSqs R2 F Pr(>F)
Ext_Lib_Plate 1 4.57 0.01272 12.423 0.001 ***
Residual 964 354.82 0.98728
Total 965 359.39 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Batch
Code
pcoa_plot (pcoa, "BioinformaticsProcessingBatch" , c (1 , 2 ))
Code
pcoa_plot (pcoa, "BioinformaticsProcessingBatch" , c (3 , 4 ))
Code
pcoa_plot (pcoa, "BioinformaticsProcessingBatch" , c (5 , 6 ))
Code
manova <- adonis2 (
dist_matrix ~ BioinformaticsProcessingBatch,
data = pcoa_data
)
manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999
adonis2(formula = dist_matrix ~ BioinformaticsProcessingBatch, data = pcoa_data)
Df SumOfSqs R2 F Pr(>F)
BioinformaticsProcessingBatch 1 3.21 0.00894 8.6962 0.001 ***
Residual 964 356.18 0.99106
Total 965 359.39 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Library Pool
Code
pcoa_plot (pcoa, "Library Pool" , c (1 , 2 ))
Code
pcoa_plot (pcoa, "Library Pool" , c (3 , 4 ))
Code
pcoa_plot (pcoa, "Library Pool" , c (5 , 6 ))
Code
manova <- adonis2 (
dist_matrix ~ ` Library Pool ` ,
data = pcoa_data
)
manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999
adonis2(formula = dist_matrix ~ `Library Pool`, data = pcoa_data)
Df SumOfSqs R2 F Pr(>F)
`Library Pool` 17 16.13 0.04487 2.6199 0.001 ***
Residual 948 343.26 0.95513
Total 965 359.39 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Lane
Code
pcoa_plot (pcoa, "Lane" , c (1 , 2 ))
Code
pcoa_plot (pcoa, "Lane" , c (3 , 4 ))
Code
pcoa_plot (pcoa, "Lane" , c (5 , 6 ))
Code
manova <- adonis2 (
dist_matrix ~ Lane,
data = pcoa_data
)
manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999
adonis2(formula = dist_matrix ~ Lane, data = pcoa_data)
Df SumOfSqs R2 F Pr(>F)
Lane 1 0.66 0.00185 1.7847 0.037 *
Residual 964 358.73 0.99815
Total 965 359.39 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
PCoA on relative abundances
Code
dist_matrix_relab <-
vegdist (
relabs |> dplyr:: select (- c (CST, subCST, score)) |> column_to_rownames ("sampleID" ),
method = "bray" )
pcoa_relab <- wcmdscale (dist_matrix_relab, eig = TRUE )
print (pcoa_relab)
Call: wcmdscale(d = dist_matrix_relab, eig = TRUE)
Inertia Rank
Total 309.98
Real 381.84 326
Imaginary -71.86 639
Results have 966 points, 326 axes
Eigenvalues:
[1] 71.59 59.11 39.07 19.66 15.30 12.04 10.41 8.62 7.37 6.53 5.77 5.39
[13] 5.24 4.78 4.31 4.24 3.83 3.75 3.24 3.15 2.98 2.79 2.68 2.57
[25] 2.24 2.16 2.08 1.94 1.77 1.73 1.59 1.54 1.53 1.41 1.36 1.35
[37] 1.28 1.25 1.18 1.16 1.12 1.09 1.05 1.03 0.98 0.96 0.93 0.91
[49] 0.88 0.87 0.82 0.80 0.79 0.78 0.76 0.75 0.74 0.70 0.69 0.67
[61] 0.66 0.64 0.63 0.62 0.59 0.56 0.56 0.55 0.54 0.52 0.51 0.49
[73] 0.49 0.49 0.48 0.47 0.46 0.45 0.44 0.43 0.42 0.41 0.41 0.40
[85] 0.39 0.38 0.38 0.37 0.36 0.36 0.35 0.35 0.34 0.33 0.33 0.32
[97] 0.31 0.30 0.30 0.29 0.29 0.28 0.28 0.27 0.26 0.26 0.26 0.26
[109] 0.25 0.24 0.24 0.24 0.24 0.23 0.23 0.23 0.22 0.22 0.21 0.21
(Showing 120 of 965 eigenvalues)
Weights: Constant
Code
pcoa_relab$ eig |>
as.data.frame () |>
rownames_to_column ("axis" ) |>
mutate (axis = str_remove (axis, "Eigenvalue" ),
var_explained = 100 * pcoa$ eig/ sum (pcoa_relab$ eig)) |>
slice (1 : 10 ) |>
gt () |>
tab_header (title = "Eigenvalues of the PCoA" ) |>
cols_label (axis = "Axis" , ` pcoa_relab$eig ` = "Eigenvalue" , var_explained = "Variance explained" ) |>
fmt_number (columns = vars (` pcoa_relab$eig ` , var_explained), decimals = 3 )
Axis
Eigenvalue
Variance explained
1
71.593
19.670
2
59.108
16.257
3
39.074
10.844
4
19.664
7.003
5
15.305
5.155
6
12.043
4.403
7
10.414
4.031
8
8.623
3.514
9
7.367
3.068
10
6.533
2.850
Code
pcoa_relab$ eig |>
as_data_frame () |>
rownames_to_column ("axis" ) |>
mutate (axis = axis |> as.numeric ()) |>
ggplot (aes (x = axis, y = value)) +
geom_col () +
labs (
x= "Axis" ,
y = "Eigenvalue"
) +
theme_bw ()
Code
pcoa_relab_data <-
as.data.frame (pcoa_relab$ points) |>
mutate (sampleID = counts$ sampleID) |>
left_join (
technical_metadata |> filter (is.na (LibrarySequencedTwice)),
by = c ("sampleID" = "UID" )
)
Type of sample
Code
pcoa_plot (pcoa_relab, "SampleType" , c (1 , 2 ))
Code
pcoa_plot (pcoa_relab, "SampleType" , c (1 , 2 ))
Code
pcoa_plot (pcoa_relab, "SampleType" , c (1 , 2 ))
Plate
Code
pcoa_plot (pcoa_relab, "Ext_Lib_Plate" , c (1 , 2 ))
Code
pcoa_plot (pcoa_relab, "Ext_Lib_Plate" , c (3 , 4 ))
Code
pcoa_plot (pcoa_relab, "Ext_Lib_Plate" , c (5 , 6 ))
Code
manova <- adonis2 (
dist_matrix_relab ~ Ext_Lib_Plate,
data = pcoa_relab_data
)
manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999
adonis2(formula = dist_matrix_relab ~ Ext_Lib_Plate, data = pcoa_relab_data)
Df SumOfSqs R2 F Pr(>F)
Ext_Lib_Plate 1 4.345 0.01402 13.704 0.001 ***
Residual 964 305.639 0.98598
Total 965 309.984 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Batch
Code
pcoa_plot (pcoa_relab, "BioinformaticsProcessingBatch" , c (1 , 2 ))
Code
pcoa_plot (pcoa_relab, "BioinformaticsProcessingBatch" , c (3 , 4 ))
Code
pcoa_plot (pcoa_relab, "BioinformaticsProcessingBatch" , c (5 , 6 ))
Code
manova <- adonis2 (
dist_matrix_relab ~ BioinformaticsProcessingBatch,
data = pcoa_relab_data
)
manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999
adonis2(formula = dist_matrix_relab ~ BioinformaticsProcessingBatch, data = pcoa_relab_data)
Df SumOfSqs R2 F Pr(>F)
BioinformaticsProcessingBatch 1 3.212 0.01036 10.093 0.001 ***
Residual 964 306.772 0.98964
Total 965 309.984 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Library Pool
Code
pcoa_plot (pcoa_relab, "Library Pool" , c (1 , 2 ))
Code
pcoa_plot (pcoa_relab, "Library Pool" , c (3 , 4 ))
Code
pcoa_plot (pcoa_relab, "Library Pool" , c (5 , 6 ))
Code
manova <- adonis2 (
dist_matrix_relab ~ ` Library Pool ` ,
data = pcoa_relab_data
)
manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999
adonis2(formula = dist_matrix_relab ~ `Library Pool`, data = pcoa_relab_data)
Df SumOfSqs R2 F Pr(>F)
`Library Pool` 17 15.021 0.04846 2.8398 0.001 ***
Residual 948 294.963 0.95154
Total 965 309.984 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Lane
Code
pcoa_plot (pcoa_relab, "Lane" , c (1 , 2 ))
Code
pcoa_plot (pcoa_relab, "Lane" , c (3 , 4 ))
Code
pcoa_plot (pcoa_relab, "Lane" , c (5 , 6 ))
Code
manova <- adonis2 (
dist_matrix_relab ~ Lane,
data = pcoa_relab_data
)
manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999
adonis2(formula = dist_matrix_relab ~ Lane, data = pcoa_relab_data)
Df SumOfSqs R2 F Pr(>F)
Lane 1 0.615 0.00198 1.9169 0.054 .
Residual 964 309.369 0.99802
Total 965 309.984 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Exclude failed samples)
…
Add relative abundances
assay_name = rel_ab
Additional SE with bacteria only counts and relative abundances
Take the counts assay from SE1; subset the feature that are bacteria; compute total_bacterial_reads and rel_ab
Save SummarizedExperiment objects
Save the SE objects to disk